setwd("/Users/jaeheehwang/Documents/gelman/")
# Import Data
library(plotly)
## Warning: package 'plotly' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(package = "lattice")
library(ggmap)
## 
## Attaching package: 'ggmap'
## 
## The following object is masked from 'package:plotly':
## 
##     wind
library(source.gist)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
library(rworldmap)
## Warning: package 'rworldmap' was built under R version 3.2.3
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library("Hmisc")
## Warning: package 'Hmisc' was built under R version 3.2.3
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:plotly':
## 
##     subplot
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(base)

dat <- read.csv ("cleanwithGDP.csv",header=TRUE, stringsAsFactors=FALSE, fileEncoding="latin1")
datc <- dat
variable.names(datc)
##  [1] "registerNum"       "country1"          "country2"         
##  [4] "dateBegan"         "dateEnded"         "durationDays"     
##  [7] "peopleDead"        "peopleDisplaced"   "damageUsd"        
## [10] "mainCause1"        "mainCause2"        "mainCause3"       
## [13] "severity"          "affectedSqKm"      "magnitude"        
## [16] "centroidX"         "centroidY"         "GDPmil"           
## [19] "GDPrank"           "Income.Group.OECD" "Region"           
## [22] "Land.Area"
########################################################################
#### AREA AFFECTED
# Let's see how the distribution of affected sqkm range. 
areaInterval <- cut2(datc$affectedSqKm, c(10000,50000,100000,10000000))
table(areaInterval)
## areaInterval
## [1.2e+01,1.0e+04) [1.0e+04,5.0e+04) [5.0e+04,1.0e+05) [1.0e+05,1.0e+07] 
##              1196              1395               622              1099
# Let's divide affected area into small, medium, large, or very large intervals. 
datc$affected <- ifelse(datc$affectedSqKm < 10000, "Small", ifelse(datc$affectedSqKm < 50000, "Medium", ifelse(datc$affectedSqKm < 100000, "Large", "Very Large")))
datc$affected = factor(datc$affected,levels=c("Small", "Medium", "Large", "Very Large"), ordered = TRUE) 
table(datc$affected)  
## 
##      Small     Medium      Large Very Large 
##       1196       1395        622       1099
########################################################################
#### People Dead
table(cut2(datc$peopleDead, g = 10))
## 
##           0           1 [ 2,     4) [ 4,     6) [ 6,    11) [11,    19) 
##        1231         235         420         298         435         405 
## [19,    36) [36,    88) [88,160000] 
##         444         413         431
deadInterval <- cut2(datc$peopleDead, c(10,50,100,10000000))
table(deadInterval)
## deadInterval
## [0e+00,1e+01) [1e+01,5e+01) [5e+01,1e+02) [1e+02,1e+07] 
##          2549          1093           282           388
# Let's divide people dead into intervals
datc$dead <- ifelse(datc$peopleDead < 10, "Under 10", ifelse(datc$peopleDead < 50, "10-50", ifelse(datc$peopleDead < 100, "50-100", "Over 100")))
datc$dead = factor(datc$dead,levels=c("Under 10", "10-50","50-100","Over 100"), ordered = TRUE) 
table(datc$dead)  
## 
## Under 10    10-50   50-100 Over 100 
##     2549     1093      282      388
########################################################################
#### People displaced
table(cut2(datc$peopleDisplaced, g = 10))
## 
##                 0 [     2,      21) [    21,     406) [   406,    1600) 
##              1278                19               465               414 
## [  1600,    4080) [  4080,   10308) [ 10308,   31000) [ 31000,  150000) 
##               461               389               426               429 
## [150000,40000000] 
##               431
displacedInterval <- cut2(datc$peopleDisplaced, c(30,5000,100000,100000000))
table(displacedInterval)
## displacedInterval
## [0e+00,3e+01) [3e+01,5e+03) [5e+03,1e+05) [1e+05,1e+08] 
##          1306          1356          1114           536
# Let's divide people displaced into intervals
datc$displaced <- ifelse(datc$peopleDisplaced < 30, "Under 30", ifelse(datc$peopleDisplaced < 5000, "30-5000", ifelse(datc$peopleDisplaced < 100000, "5000-100000", "Over 100000")))
table(datc$displaced)  
## 
##     30-5000 5000-100000 Over 100000    Under 30 
##        1356        1114         536        1306
datc$displaced = factor(datc$displaced,levels=c("Under 30", "30-5000","5000-100000","Over 100000"), ordered = TRUE) 


########################################################################
#### Duration
table(cut2(datc$durationDays, g = 10))
## 
## [ 1,  3)        3        4        5        6 [ 7,  9) [ 9, 11) [11, 16) 
##      587      573      486      427      262      435      289      444 
## [16, 26) [26,419] 
##      398      411
durationInterval <- cut2(datc$durationDays, c(5, 10, 20, 500))
table(durationInterval)
## durationInterval
## [  1,  5) [  5, 10) [ 10, 20) [ 20,500] 
##      1646      1278       776       612
# Let's divide duration into intervals
datc$duration <- ifelse(datc$durationDays < 5, "Under 5", ifelse(datc$durationDays < 10, "5-10", ifelse(datc$durationDays < 20, "10-20", "Over 20")))
table(datc$duration)  
## 
##   10-20    5-10 Over 20 Under 5 
##     776    1278     612    1646
datc$duration = factor(datc$duration,levels=c("Under 5", "5-10","10-20","Over 20"), ordered = TRUE) 

#########################################################################
table(datc$Income.Group.OECD)
## 
##                      High income: nonOECD    High income: OECD 
##                  100                  203                  990 
##           Low income  Lower middle income  Upper middle income 
##                  452                 1357                 1210
datc$incomeGroup <- datc$Income.Group.OECD
datc$incomeGroup = factor(datc$incomeGroup, levels = c("Low income", "Lower middle income", "Upper middle income","High income: nonOECD", "High income: OECD"), ordered = TRUE)
table(datc$incomeGroup)
## 
##           Low income  Lower middle income  Upper middle income 
##                  452                 1357                 1210 
## High income: nonOECD    High income: OECD 
##                  203                  990
datc$GDPlevel <- ifelse(datc$Income.Group.OECD == "High income: OECD", 5, 
                        ifelse(datc$Income.Group.OECD == "High income: nonOECD", 4,
                               ifelse(datc$Income.Group.OECD == "Upper middle income", 3,
                                      ifelse(datc$Income.Group.OECD == "Lower middle income", 2, 1))))
table(datc$GDPlevel)
## 
##    1    2    3    4    5 
##  552 1357 1210  203  990
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#########################################################################
#plots


#GDP rank by region
#North America has high GDP. Sub-saharan africa seems to have low GDP. 
bwplot(~datc$GDPrank|factor(datc$Region),data=datc,col="blue",
       main="GDP rank by region")

#income group by region
#North America has many high income group countries, whereas south asia and sub-saharan africa has many low-income countries. 
bwplot(~datc$incomeGroup|factor(datc$Region),data=datc,col="blue",
       main="income group by region")

# People displacement and country income group. 
dotplot(datc$peopleDisplaced ~ datc$magnitude | datc$incomeGroup, data = datc)

dotplot(datc$displaced ~ datc$magnitude | datc$incomeGroup, data = datc)

# People dead and country income group. 
dotplot(datc$peopleDead ~ datc$magnitude | datc$incomeGroup, data = datc)

dotplot(datc$dead ~ datc$magnitude | datc$incomeGroup, data = datc)

# Magnitude and affected square km are positively correlated in a S shaped curve. 
# dotplot(datc$affectedSqKm ~ datc$magnitude | datc$Region, data = datc)
dotplot(datc$affected ~ datc$magnitude | datc$Region, data = datc)

# In North America Europe, Middle East and North Africa, 
# less people were likely to be dead regardless of magnitude and squaremeter affected
# In South Africa, more people were likely to be dead when affected squaremeter was larger.  
dotplot(datc$affectedSqKm ~ datc$dead | datc$Region, data = datc) 

dotplot(datc$affectedSqKm ~ datc$displaced | datc$Region, data = datc)

dotplot(datc$magnitude ~ datc$dead | datc$Region, data = datc)

dotplot(datc$magnitude ~ datc$displaced | datc$Region, data = datc)

dotplot(datc$magnitude ~ datc$dead | datc$incomeGroup, data = datc)

dotplot(datc$magnitude ~ datc$displaced | datc$incomeGroup, data = datc)

# For high income countries, less people died regardless of affected squaremeter or magnitude. 
# For less than high income countries, more people were prone to die at higher squarementer and magnitude. 
dotplot(datc$affectedSqKm ~ datc$dead | datc$incomeGroup, data = datc)

dotplot(datc$affectedSqKm ~ datc$displaced | datc$incomeGroup, data = datc)

bwplot(~datc$displaced|factor(datc$incomeGroup),data=datc,col="blue",
       main="Displacement by Country")

bwplot(~datc$dead|factor(datc$incomeGroup),data=datc,col="blue",
       main="Number Dead by Country")

dotplot(datc$dead ~ datc$GDPrank | datc$Region, data = datc) 

dotplot(datc$affected ~ datc$GDPrank | datc$Region, data = datc)

#magnitude and affected area by region
#monsoons seem to bring high magnitude floods. 
bwplot(~datc$magnitude|factor(datc$mainCause1),data=datc,col="blue",
       main="magnitude by main cause")

#monsoons seem to affect large regions
#landslides seem to affect small regions
#tsunami seems to affect wide array of regions. 
bwplot(~datc$affected|factor(datc$mainCause1),data=datc,col="blue",
       main="affected area by main cause")

# tsunamis seem to kill many people. 
#snow/icemelt seem to usually not kill many. 
bwplot(~datc$dead|factor(datc$mainCause1),data=datc,col="blue",
       main="death by main cause")

#hurricanes and monsoons seem to displace a lot of people. 
#landslide and avalanches seem to now displace many. 
bwplot(~datc$displaced|factor(datc$mainCause1),data=datc,col="blue",
       main="displacement by main cause")

dotplot(datc$damageUsd ~ datc$incomeGroup | datc$Region, data = datc) 

#main causes by region. 
#stacked histogram. diff lengths. 
#East Asia seems to have predominantly many floods. 
hist <- ggplot(datc, aes(x = datc$Region, fill = datc$mainCause1))
hist + geom_bar(color = "black", aes(fill = datc$mainCause1, position = "fill"))+
  labs(title = "Main Causes by Region", x = "Region", y = "Percent") + 
  scale_fill_discrete(name="Main Causes") + guides(fill = guide_legend(reverse=TRUE))

#equal height
#Main cause seems to be Heavy Rain, except for South Asia where Monsoon also seems to be a leading cause. 
#East Asia & Pacific also had some Trophical Storms. 
hist2 <- ggplot(datc, aes(x = datc$Region))
hist + geom_bar(color = "black", aes(fill = datc$mainCause1), position = "fill")+
  labs(title = "Main Causes by Region", x = "Region", y = "Percent") + 
  scale_fill_discrete(name="Main Causes") + guides(fill = guide_legend(reverse=TRUE))

###########################################################################################
# other ideas: radar chart? 
# http://www.cmap.polytechnique.fr/~lepennec/R/Radar/RadarAndParallelPlots.html
# can we throw in a bubble graph?
###########################################################################################
#mapping

library(plotly)
library(ggplot2)
library(package = "lattice")
library(ggmap)
library(source.gist)
library(rworldmap)

dat <- read.csv ("cleanwithGDP.csv",header=TRUE, stringsAsFactors=FALSE, fileEncoding="latin1")
datc <- dat
variable.names(datc)
##  [1] "registerNum"       "country1"          "country2"         
##  [4] "dateBegan"         "dateEnded"         "durationDays"     
##  [7] "peopleDead"        "peopleDisplaced"   "damageUsd"        
## [10] "mainCause1"        "mainCause2"        "mainCause3"       
## [13] "severity"          "affectedSqKm"      "magnitude"        
## [16] "centroidX"         "centroidY"         "GDPmil"           
## [19] "GDPrank"           "Income.Group.OECD" "Region"           
## [22] "Land.Area"
library(rworldmap)

library(maps)
## Warning: package 'maps' was built under R version 3.2.3
## 
##  # ATTENTION: maps v3.0 has an updated 'world' map.        #
##  # Many country borders and names have changed since 1990. #
##  # Type '?world' or 'news(package="maps")'. See README_v3. #
library(ggplot2)
world_map <- map_data("world")
p <- ggplot() + coord_fixed() + xlab("") + ylab("")

base_world_messy <- p + geom_polygon(data=world_map, aes(x=long, y=lat, group=group), 
                                     colour="light green", fill="light green")

base_world_messy

cleanup <- 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        panel.background = element_rect(fill = 'white', colour = 'white'), 
        axis.line = element_line(colour = "white"), legend.position="none",
        axis.ticks=element_blank(), axis.text.x=element_blank(),
        axis.text.y=element_blank())

datc1 <- datc[1:500,]

base_world <- base_world_messy + cleanup

map_data <- 
  base_world +
  geom_point(data=datc1, 
             aes(x=centroidX, y=centroidY), colour="Deep Pink", 
             fill="Pink",pch=21, size=5, alpha=I(0.7))
variable.names(datc1)
##  [1] "registerNum"       "country1"          "country2"         
##  [4] "dateBegan"         "dateEnded"         "durationDays"     
##  [7] "peopleDead"        "peopleDisplaced"   "damageUsd"        
## [10] "mainCause1"        "mainCause2"        "mainCause3"       
## [13] "severity"          "affectedSqKm"      "magnitude"        
## [16] "centroidX"         "centroidY"         "GDPmil"           
## [19] "GDPrank"           "Income.Group.OECD" "Region"           
## [22] "Land.Area"
datc1$Income.Group <- datc1$Income.Group.OECD 
#map_data_coloured <- base_world +
#  geom_point(data=datc1, 
#             aes(x=centroidX, y=centroidY, colour=datc$Income.Group), size=3, alpha=I(0.7))

#map_data_coloured

#flood causes and affected sqkm
map_data_sized <- 
  base_world +
  geom_point(data=datc1, 
             aes(x=centroidX, y=centroidY, colour=mainCause1, size=affectedSqKm), alpha=I(0.7)) 

map_data_sized

#magnitude and affectedsqkm
MagAndSqkm <- 
  base_world +
  geom_point(data=datc1, 
             aes(x=centroidX, y=centroidY, colour=magnitude, size=affectedSqKm), alpha=I(0.7)) 
MagAndSqkm

map_data_sized <- 
  base_world +
  geom_point(data=datc1, 
             aes(x=centroidX, y=centroidY, colour=mainCause1, size=damageUsd), alpha=I(0.7)) 

map_data_sized

###########################################################################################
###########################################################################################

###########################################################################################
# other ideas: radar chart? 
# http://www.cmap.polytechnique.fr/~lepennec/R/Radar/RadarAndParallelPlots.html
# can we throw in a bubble graph?
###########################################################################################